##########################################################
### Replication Materials for
### Stefan Müller, Garrett Kennedy, and Tomas Maher:
### Reactions to Experts in Deliberative Democracy: The 2016-2018 Irish Citizens' Assembly
### Irish Political Studies
### 
### Please get in touch with the authors if you have any questions: 
### stefan.mueller@ucd.ie
### You find detailed instructions on the replication materials 
### in the file 0000_readme.pdf
##########################################################
##########################################################

## Reproduce analyses based on corpus split into chunks

# Load Relevant Libraries 

library(quanteda)      # CRAN v3.3.0
library(tidyverse)     # CRAN v1.3.2
library(stringr)       # CRAN v1.5.0
library(forcats)       # CRAN v1.0.0
library(stm)           # CRAN v1.3.6
library(cowplot)       # CRAN v1.1.1 # CRAN v1.1.1
library(furrr)         # CRAN v0.3.1
library(ggeffects)     # CRAN v1.1.5
library(texreg)        # CRAN v1.38.6
library(xtable)        # CRAN v1.8-4

# load custom ggplot2 scheme

# Custom ggplot2 theme used for all figures in this script.
#
# Starts from theme_minimal() and replaces selected elements: all grid lines
# are removed, the panel gets a thin black border, panel and plot backgrounds
# are solid white, the legend sits at the bottom, and text sizes are enlarged.
#
# Returns a complete ggplot2 theme object (e.g. for use with theme_set()).
theme_baser <- function() {
    theme_minimal() %+replace%
        theme(
            # no grid lines at all
            panel.grid.minor.x = element_blank(),
            panel.grid.minor.y = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.major.y = element_blank(),
            panel.border = element_rect(fill = NA, colour = "black",
                                        linewidth = 0.5, linetype = "solid"),
            legend.title = element_text(size = 15),
            plot.caption = element_text(colour = "grey30", size = 11, hjust = 1),
            plot.title = element_text(size = 15, hjust = 0.5, face = "bold",
                                      margin = margin(b = 5, r = 5, l = 5, t = 5)),
            legend.position = "bottom",
            # `linewidth` replaces the deprecated `size` argument for line
            # elements (ggplot2 >= 3.4.0), matching panel.border above
            axis.ticks.y = element_line(linewidth = 0.3),
            axis.ticks.x = element_line(linewidth = 0.3),
            axis.ticks.length = unit(0.2, "cm"),
            legend.text = element_text(size = 13),
            panel.background = element_rect(fill = "white"), # white panel bg
            plot.background = element_rect(fill = "white", color = "white"), # white plot bg
            strip.text = element_text(size = 15, hjust = 0.5, face = "bold",
                                      margin = margin(b = 5, r = 5, l = 5, t = 5)),
            axis.text.y = element_text(colour = "black", size = 13, hjust = 1),
            axis.text.x = element_text(colour = "black", size = 13),
            axis.title = element_text(size = 13, hjust = 0.5)
        )
}



# set theme
theme_set(theme_baser())


# function for predicted probability plots
# Line-and-ribbon plot of predicted values over a continuous predictor.
#
# x:    data frame with the columns x, predicted, conf.low, conf.high and
#       group (group holding "Expert Input" / "Other Agenda Item")
# xlab: label of the horizontal axis
# ylab: label of the vertical axis
#
# Returns a ggplot object with one facet per group.
plot_continuous <- function(x, 
                            xlab = "Topic Prevalence of Input",
                            ylab = "Predicted Maximum Topic Prevalence\nAcross Subsequent Q&A Sessions") {

    facet_levels <- c("Expert Input", "Other Agenda Item")

    # fix the facet order: expert input first
    pred_df <- x
    pred_df$group_factor <- factor(pred_df$group, levels = facet_levels)

    canvas <- ggplot(data = pred_df,
                     aes(x = x, y = predicted,
                         ymin = conf.low, ymax = conf.high))

    canvas +
        geom_ribbon(fill = "grey80") +
        geom_line(linewidth = 0.8) +
        facet_wrap(~group_factor) +
        scale_y_continuous(breaks = seq(0, 7, 0.1)) +
        scale_x_continuous(breaks = seq(0, 1, 0.2)) +
        labs(x = xlab, y = ylab)
}




# Point-and-range plot of predicted values for a discrete predictor.
#
# x:    data frame with the columns x, predicted, conf.low, conf.high and
#       group (group holding "Expert Input" / "Other Agenda Item")
# xlab: label of the horizontal axis
# ylab: label of the vertical axis
#
# Returns a ggplot object with one facet per group.
plot_discrete <- function(x, 
                          xlab = "Topic Prevalence of Input",
                          ylab = "Predicted Maximum Topic Prevalence\nAcross Subsequent Q&A Sessions") {

    facet_levels <- c("Expert Input", "Other Agenda Item")

    # fix the facet order: expert input first
    pred_df <- x
    pred_df$group_factor <- factor(pred_df$group, levels = facet_levels)

    canvas <- ggplot(data = pred_df,
                     aes(x = x, y = predicted,
                         ymin = conf.low,
                         ymax = conf.high))

    canvas +
        geom_point(size = 3) +
        geom_linerange() +
        facet_wrap(~group_factor) +
        scale_y_continuous(breaks = seq(0, 7, 0.1)) +
        labs(x = xlab, y = ylab)
}


# Prepare Data----

# Import data
dat_raw <- readRDS("data_deliberation_ireland.rds")

# Drop rows without text (text is NA)
dat_raw_text <- subset(dat_raw, !is.na(text))

# Import reference dataset

dat_ref <- readRDS("topics_ref.rds") 

# Merge both datasets

# Drop the reference file's date and assembly columns before the join
dat_ref <- dat_ref %>% 
  select(-c("date", "assembly"))

# Rename the 2nd and 3rd remaining columns BY POSITION
# NOTE(review): this depends on the column order of topics_ref.rds -- confirm
names(dat_ref)[c(2, 3)] <- c("order", "group")


# Attach the reference variables via title_youtube (all rows of dat_raw_text
# are kept), then drop the document "Citizens' Assembly 2017-07-09 1"
dat_coded <- left_join(dat_raw_text, dat_ref, by = "title_youtube") %>% 
  filter(docname != "Citizens' Assembly 2017-07-09 1") 

# Overwrite the date of the item with order == 232
dat_coded$date[dat_coded$order == 232] <- "2017-07-09"

# Create expert variable

# Missing `type` values are first set to "Information Session".
# session_type is then
#   "Q&A"    if type is "Q&A" or starts with "Feedback",
#   "Expert" if position_short starts with "Expert",
#   "Other"  otherwise (including cases where the comparison yields NA).
dat_coded <- dat_coded %>%
  mutate(type = replace_na(type, "Information Session")) %>% 
  mutate(session_type = ifelse(type == "Q&A" | substr(type, 1, 8) == "Feedback", "Q&A",
                               ifelse(substr(position_short, 1, 6) == "Expert", "Expert", "Other"))) %>% 
  mutate(session_type = replace_na(session_type, "Other"))


# Clean the data 

# cleaning the text
# 1. Lower-case all text.
# 2. Apply a map of regex pattern -> replacement pairs. Entries either correct
#    what look like mis-transcribed words (e.g. "tea shock" -> "taoiseach"),
#    or delete a word entirely (replacement "").
#
# NOTE(review): the patterns are unanchored regular expressions, so they also
# match inside longer words (e.g. "table" inside "tablet", "float" inside
# "floating") -- confirm this is intended.
# NOTE(review): stringr appears to apply the pairs of a named vector one after
# another in the order listed, so the order matters: "question" is removed
# before "questions" is tried (leaving a stray "s", which the later " s " rule
# seems meant to clean up), and rules such as "ssia" -> "ssi" operate on the
# output of the preceding rules. Do not reorder entries without re-checking
# the results.
dat_coded$text <- str_to_lower(dat_coded$text)
dat_coded$text <- str_replace_all(dat_coded$text, c("roach" = "roche", 
                                                    "bala" = "ballot",
                                                    "tea shock" = "taoiseach",
                                                    "kylie" = "kindly", 
                                                    "ballast" = "ballots", 
                                                    "arrakis" = "oireachtas", 
                                                    "approche" = "approach", 
                                                    "laphroaig" = "laffoy", 
                                                    "coffee" = "", 
                                                    "sheehan" = "",
                                                    "depressiaon" = "depression",
                                                    "volts" = "votes", 
                                                    "yeah" = "", 
                                                    "table" = "", 
                                                    " one " = " ", 
                                                    "question" = "",
                                                    "questions" = "",
                                                    " yes " = " ",
                                                    "commissiaon" = "commission",
                                                    " no " = " ", 
                                                    "\\-" = "", 
                                                    "\\\\" = "", 
                                                    "ssia" = "ssi",
                                                    "depressiave" = "depressive", 
                                                    "matter and" = "met eireann", 
                                                    "my turn" = "met eireann", 
                                                    "met her and her" = "met eireann", 
                                                    "my sharon\\'s" = "met eireann", 
                                                    "matheran" = "met eireann", 
                                                    "metron" = "met eireann",
                                                    "windham" = "winter",
                                                    "huracan" = "hurricane", 
                                                    "greenness got" = "greenhouse gases but", 
                                                    "clutter" = "climate", 
                                                    "metaphase" = "met office", 
                                                    "float" = "flood", 
                                                    "weatr" = "weather",
                                                    "submissiaons" = "submissions",
                                                    " s " = " ",
                                                    "sessiaon" = "session"
))



# Create Corpus

# one document per row of dat_coded; document names come from `docname`
corp <- corpus(dat_coded, docid_field = "docname")

# Get length of sessions (number of tokens, with and without punctuation)

corp$session_length <- ntoken(corp)
corp$session_length_no_punct <- ntoken(corp, remove_punct = TRUE)


# the four assembly topics analysed in this script
groups_of_interest <- c("8th Amendment",
                        "Referenda",
                        "Ageing Population",
                        "Climate Change")

corp_relevant <- corpus_subset(corp, group %in% groups_of_interest)


# Filter corpora into subsets, one corpus per assembly topic

corp_eighth    <- corpus_subset(corp, group == "8th Amendment")      # Eighth
corp_referenda <- corpus_subset(corp, group == "Referenda")          # Referenda
corp_ageing    <- corpus_subset(corp, group == "Ageing Population")  # Ageing
corp_climate   <- corpus_subset(corp, group == "Climate Change")     # Climate


# Create DFMs

# Number of tokens per chunk
chunk_size <- 500

# Turn chunked tokens into a document-feature matrix, record the number of
# tokens of each chunk in the docvar `chunk_length` (counted BEFORE stopword
# removal), and then remove English stopwords.
chunks_to_dfm <- function(chunked_tokens) {
    dfmat <- dfm(chunked_tokens)
    docvars(dfmat, "chunk_length") <- ntoken(dfmat)
    dfm_remove(dfmat, pattern = stopwords("en"))
}

# Eighth
tok_eighth       <- tokens(corp_eighth)
tok_eighth_chunk <- tokens_chunk(tok_eighth, size = chunk_size)
dfm_eighth_raw   <- chunks_to_dfm(tok_eighth_chunk)

# Referenda
tok_referenda       <- tokens(corp_referenda)
tok_referenda_chunk <- tokens_chunk(tok_referenda, size = chunk_size)
dfm_referenda_raw   <- chunks_to_dfm(tok_referenda_chunk)

# Ageing
tok_ageing       <- tokens(corp_ageing)
tok_ageing_chunk <- tokens_chunk(tok_ageing, size = chunk_size)
dfm_ageing_raw   <- chunks_to_dfm(tok_ageing_chunk)

# Climate
tok_climate       <- tokens(corp_climate)
tok_climate_chunk <- tokens_chunk(tok_climate, size = chunk_size)
dfm_climate_raw   <- chunks_to_dfm(tok_climate_chunk)



# Topic Models----


# Set seed and trim data
# (trimming thresholds differ by assembly topic; the Referenda dfm is not
# trimmed here -- it is only filtered by feature length before the stm
# conversion further below)

set.seed(123)

# Eighth

# trim the documents  to remove unnecessary tokens:
# keep features at or above the 0.8 quantile of term frequency that occur in
# at most 10% of the chunks, then keep only features with >= 2 characters
dfm_eighth_topics <- dfm_trim(dfm_eighth_raw, 
                              min_termfreq = 0.8, termfreq_type = "quantile", 
                              max_docfreq = 0.1, docfreq_type = "prop") %>% 
  dfm_keep(min_nchar = 2)

# inspect the most frequent remaining features
topfeatures(dfm_eighth_topics)

# Ageing

# term-frequency quantile >= 0.1, document frequency <= 60% of the chunks
dfm_ageing_topics <- dfm_trim(dfm_ageing_raw, 
                              min_termfreq = 0.1, termfreq_type = "quantile", 
                              max_docfreq = 0.6, docfreq_type = "prop") %>% 
  dfm_keep(min_nchar = 2)

# Climate

# term-frequency quantile >= 0.9, document frequency <= 50% of the chunks
dfm_climate_topics <- dfm_trim(dfm_climate_raw, 
                               min_termfreq = 0.9, termfreq_type = "quantile", 
                               max_docfreq = 0.5, docfreq_type = "prop") %>% 
  dfm_keep(min_nchar = 2)



# Convert to stm 
# (these stm-format objects are the inputs of the diagnostics for Figure A9)

# Eighth

# filter out the chunk "Citizens' Assembly 2017-02-04 2.9" before converting
# (the same chunk is excluded again when the final model is fitted below,
# where it is described as dropped when making the STM)
dfm_eighth_temp <- dfm_subset(dfm_eighth_topics, docname_ != "Citizens' Assembly 2017-02-04 2.9")

# convert the dfm to a format used for stm
dfm_eighth_topics_stm <- quanteda::convert(dfm_eighth_temp, to = "stm")

# Referenda

# no frequency-based trimming for Referenda: only drop features with fewer
# than 2 characters
dfm_referenda_topics <- dfm_referenda_raw %>% 
  dfm_keep(min_nchar = 2)

# convert the dfm to a format used for stm
dfm_referenda_topics_stm <- quanteda::convert(dfm_referenda_topics, to = "stm")

# Ageing

dfm_ageing_topics_stm <- quanteda::convert(dfm_ageing_topics, to = "stm")

# Climate

dfm_climate_topics_stm <- quanteda::convert(dfm_climate_topics, to='stm')

# Figure A9 ----

# Generate Diagnostic Values to Pick Topic Numbers

# The function validate_stm() below is based on a tutorial by Julia Silge for
# assessing different numbers of topics.
# Note: the function validate_stm() takes an stm object as
# the input, and the user specifies the range of topics to
# be validated. We set seeds to ensure reproducibility.

# link to original tutorial: https://juliasilge.com/blog/evaluating-stm/

# Fit the candidate models in parallel background R sessions.
# (Set once: calling plan() a second time only re-creates the workers.)
plan(multisession)

# Fit one structural topic model per candidate number of topics and collect
# diagnostics for choosing K.
#
# dfmat_stm: list in stm format with the elements `documents`, `vocab` and
#            `meta` (as returned by quanteda::convert(x, to = "stm"))
# ks:        vector of candidate numbers of topics
#
# Returns a tibble with one row per K holding the fitted model (topic_model)
# and the diagnostics exclusivity, semantic_coherence, eval_heldout, residual,
# bound, lfact, lbound and iterations.
validate_stm <- function(dfmat_stm, ks) {

    # one model per K, fitted in parallel via furrr; the stm seed is fixed
    many_models <- tibble(K = ks) %>%
        mutate(topic_model = future_map(K, ~stm(documents = dfmat_stm$documents, 
                                                vocab = dfmat_stm$vocab, 
                                                data = dfmat_stm$meta,
                                                K = .,
                                                seed = 1254, 
                                                verbose = TRUE),
                                        .options = furrr_options(seed = TRUE)))

    # held-out set for the held-out likelihood.
    # `$documents` is spelled out in full: the previous `$document` only worked
    # through partial matching of list element names.
    heldout <- make.heldout(documents = dfmat_stm$documents,
                            vocab = dfmat_stm$vocab, 
                            seed = 123)

    k_result <- many_models %>%
        mutate(exclusivity = map(topic_model, exclusivity),
               semantic_coherence = map(topic_model, semanticCoherence, 
                                        documents = dfmat_stm$documents),
               eval_heldout = map(topic_model, eval.heldout, heldout$missing),
               residual = map(topic_model, checkResiduals, dfmat_stm$documents),
               # highest value of the bound recorded during fitting
               bound = map_dbl(topic_model, function(x) max(x$convergence$bound)),
               # log(K!) term added to the bound
               lfact = map_dbl(topic_model, function(x) lfactorial(x$settings$dim$K)),
               lbound = bound + lfact,
               # number of recorded bound values
               iterations = map_dbl(topic_model, function(x) length(x$convergence$bound)))

    # return the result explicitly instead of ending on an assignment
    k_result
}

# Plot the model diagnostics returned by validate_stm() against K.
#
# x:               tibble returned by validate_stm()
# title:           plot title
# topics_selected: selected number of topics, marked by a dashed red line
#
# Prints the plot and returns the ggplot object invisibly.
plot_validate_stm <- function(x, title, topics_selected) {

    # one summary value per model and metric
    diagnostics <- x %>%
        transmute(K,
                  `Lower Bound` = lbound,
                  Residuals = map_dbl(residual, "dispersion"),
                  `Semantic Coherence` = map_dbl(semantic_coherence, mean),
                  `Held-out Likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
        # long format: one row per K and metric (pivot_longer() replaces the
        # superseded gather())
        pivot_longer(cols = -K, names_to = "Metric", values_to = "Value")

    # `linewidth` replaces the deprecated `size` argument of line geoms
    p_validate <- ggplot(diagnostics, aes(K, Value)) +
        geom_line(linewidth = 1.05, colour = "grey50", show.legend = FALSE) +
        facet_wrap(~Metric, scales = "free_y") +
        geom_vline(xintercept = topics_selected, linetype = "dashed",
                   colour = "red", linewidth = 0.8) +
        theme(strip.background = element_blank(),
              strip.text.x = element_text(size = 13),
              plot.title = element_text(size = 15, face = "bold", hjust = 0.5)) +
        labs(x = "K (Number of Topics)",
             title = title,
             y = NULL)

    print(p_validate)
    invisible(p_validate)
}



# candidate numbers of topics evaluated for every assembly topic
ks_chunks <- seq(4, 25, 1)


# Selected numbers of topics (marked by the dashed line in each panel):
# k_eight = 19
# k_referenda <- 8
# k_ageing <- 10
# k_climate <- 12

# Climate change

models_climate <- validate_stm(dfmat_stm = dfm_climate_topics_stm,
                               ks = ks_chunks)

p_climate <- plot_validate_stm(models_climate, title = "Climate Change",
                               topics_selected = 12)


# Ageing population

models_ageing <- validate_stm(dfmat_stm = dfm_ageing_topics_stm,
                              ks = ks_chunks)

p_ageing <- plot_validate_stm(models_ageing, 
                              title = "Ageing Population",
                              topics_selected = 10)


# Abortion 

models_abortion <- validate_stm(dfmat_stm = dfm_eighth_topics_stm,
                                ks = ks_chunks)


p_abortion <- plot_validate_stm(models_abortion, 
                                title = "Abortion (Eighth Amendment)",
                                topics_selected = 19)


# Referenda

models_referenda <- validate_stm(dfmat_stm = dfm_referenda_topics_stm,
                                 ks = ks_chunks)


p_referenda <- plot_validate_stm(models_referenda, 
                                 title = "Referenda",
                                 topics_selected = 8)



# combine plots
fig_a9 <- plot_grid(p_abortion,
                    p_ageing,
                    p_climate,
                    p_referenda, 
                    nrow = 2,
                    scale = 0.9)

# display the combined figure when run interactively
fig_a9

# pass the figure explicitly instead of relying on ggplot2's last_plot()
ggsave("fig_A9.pdf",
       plot = fig_a9,
       width = 11.5, height = 11.5)




# Create stm's
# One final model per assembly topic. The R seed is reset to 123 immediately
# before every stm() call, and each model runs at most 50 EM iterations
# (max.em.its = 50).

# Eighth
# input selected number of topics here:
k_eight = 19


# filter out documents that were dropped when making the STM
# NOTE: from here on `dfm_eighth_topics_stm` holds a dfm (it previously held
# the converted stm list); its docvars are used later for the topic tables
dfm_eighth_topics_stm <- dfm_subset(dfm_eighth_topics, docname_ != "Citizens' Assembly 2017-02-04 2.9")

dfm_eighth_topics_stm_obj <- convert(dfm_eighth_topics_stm, 
                                     to = "stm")

# generate topics
set.seed(123, kind = "Mersenne-Twister", normal.kind = "Inversion")

stm_eighth <- stm(dfm_eighth_topics_stm_obj$documents,
                  dfm_eighth_topics_stm_obj$vocab,
                  K = k_eight, 
                  verbose = FALSE, max.em.its = 50)



# Referenda
# input selected number of topics here:
k_referenda <- 8

# re-use the stm-format object converted earlier (no documents are filtered)
dfm_referenda_topics_stm_obj <- dfm_referenda_topics_stm

# generate topics
set.seed(123, kind = "Mersenne-Twister", normal.kind = "Inversion")

stm_referenda <- stm(dfm_referenda_topics_stm_obj$documents,
                     dfm_referenda_topics_stm_obj$vocab,
                     K = k_referenda,
                     verbose = FALSE, max.em.its = 50)


# Ageing
# input selected number of topics here:
k_ageing <- 10

# re-use the stm-format object converted earlier (no documents are filtered)
dfm_ageing_topics_stm_obj <- dfm_ageing_topics_stm

# generate topics
set.seed(123, kind = "Mersenne-Twister", normal.kind = "Inversion")

stm_ageing <- stm(dfm_ageing_topics_stm_obj$documents,
                  dfm_ageing_topics_stm_obj$vocab,
                  K = k_ageing, 
                  verbose = FALSE, max.em.its = 50)


# Climate
# input selected number of topics here:
k_climate <- 12


# filter out documents that were dropped when making the STM
# NOTE: from here on `dfm_climate_topics_stm` holds a dfm (it previously held
# the converted stm list); its docvars are used later for the topic tables
dfm_climate_topics_stm <- dfm_subset(dfm_climate_topics, 
                                     !(docname_ %in%  c("Citizens' Assembly 2017-10-01 15", "Citizens' Assembly 2017-11-04 6", "Citizens' Assembly 2017-11-05 1")))


dfm_climate_topics_stm_obj <- convert(dfm_climate_topics_stm, 
                                      to = "stm")

# generate topics
set.seed(123, kind = "Mersenne-Twister", normal.kind = "Inversion")

stm_climate <- stm(dfm_climate_topics_stm_obj$documents,
                   dfm_climate_topics_stm_obj$vocab,
                   K = k_climate,
                   verbose = FALSE, max.em.its = 50)


# For every element of x: where the element contains the string "Topic" the
# result is sum(x, na.rm = TRUE), otherwise it is max(x).
# NOTE(review): no call to this helper is visible in this part of the script.
topic_test <- function(x) {
  mentions_topic <- str_detect(x, "Topic")
  ifelse(mentions_topic, sum(x, na.rm = TRUE), max(x))
}


# Session-level topic tables

# Aggregate chunk-level topic proportions to one row per agenda item.
#
# stm_fit:    fitted stm model
# chunk_meta: document variables of the chunks the model was fitted on; must
#             contain the columns chunk_length, session_length and order
#
# Every chunk's topic proportions are weighted by the chunk's share of its
# session (chunk_length / session_length) and summed within `order`; for all
# remaining columns the maximum within `order` is kept.
# Returns the aggregated table with one row per value of `order`.
collapse_chunks_to_sessions <- function(stm_fit, chunk_meta) {
  make.dt(stm_fit, chunk_meta) %>% 
    select(-docnum) %>% 
    mutate(prop_of_text = chunk_length / session_length) %>% 
    mutate(across(starts_with("Topic"), ~ .x * prop_of_text)) %>% 
    group_by(order) %>% 
    summarise(across(starts_with("Topic"), ~ sum(., na.rm = TRUE)), 
              across(!(starts_with("Topic")), ~ max(.)))
}

# Eighth

stm_table_eighth <- collapse_chunks_to_sessions(stm_eighth,
                                                docvars(dfm_eighth_topics_stm))

write.csv(stm_table_eighth, "stm_table_eighth_chunks.csv",
          fileEncoding = "UTF-8")

# Referenda

stm_table_referenda <- collapse_chunks_to_sessions(stm_referenda,
                                                   docvars(dfm_referenda_topics))

write.csv(stm_table_referenda, "stm_table_referenda_chunks.csv",
          fileEncoding = "UTF-8")

# Ageing

stm_table_ageing <- collapse_chunks_to_sessions(stm_ageing,
                                                docvars(dfm_ageing_topics))

write.csv(stm_table_ageing, "stm_table_ageing_chunks.csv",
          fileEncoding = "UTF-8")

# Climate

stm_table_climate <- collapse_chunks_to_sessions(stm_climate,
                                                 docvars(dfm_climate_topics_stm))

write.csv(stm_table_climate, "stm_table_climate_chunks.csv",
          fileEncoding = "UTF-8")



# Tidying Data ----

# *Calculating next 5 item topic proportions*
# Eighth

# set threshold (t) and number of topics (K)
t <- 0
K <- 19

topic_table <- stm_table_eighth

# Rename Topic1..TopicK to topic_prop.topic_1..topic_prop.topic_K so that it
# is easier to move the table to a tidier format later on
topic_table <- topic_table %>% 
  rename_with(.fn = ~ sub("^Topic", "topic_prop.topic_", .x),
              .cols = all_of(paste0("Topic", seq_len(K))))

# For every agenda item (row i) and topic (j), create the columns
# next_agenda_item_<a>.topic_<j> for the next a = 1..5 agenda items:
#   NA                     if there is no agenda item a rows further down
#   "TOPIC NOT MENTIONED"  if the topic's prevalence in row i is not above t
#   the topic's prevalence in row i + a (as character) if that row is a
#                          Q&A session
#   ""                     if row i + a is not a Q&A session
# The nested ifelse() calls and the data.table-style indexing
# (`with = FALSE`) are kept exactly as before so the stored values do not
# change; only the five copy-pasted branches are collapsed into one loop.
n_items <- nrow(topic_table)

for (i in seq_len(n_items)) {
  for (j in seq_len(K)) {
    source_col <- paste0("topic_prop.topic_", j)

    for (ahead in 1:5) {
      target_col <- paste0("next_agenda_item_", ahead, ".topic_", j)

      topic_table[i, target_col] <- ifelse(
        i < (n_items - (ahead - 1)),
        ifelse(topic_table[i, source_col, with = FALSE] > t,
               ifelse(topic_table$session_type[i + ahead] == "Q&A",
                      as.character(topic_table[i + ahead, source_col, with = FALSE]),
                      ""),
               "TOPIC NOT MENTIONED"),
        NA
      )
    }
  }
}

topic_table_eighth <- topic_table

# Referenda

# set threshold (t) and number of topics (K)
t <- 0
K <- 8

topic_table <- stm_table_referenda

# Rename Topic1..TopicK to topic_prop.topic_1..topic_prop.topic_K
topic_table <- topic_table %>% 
  rename_with(.fn = ~ sub("^Topic", "topic_prop.topic_", .x),
              .cols = all_of(paste0("Topic", seq_len(K))))

# For every agenda item (row i) and topic (j), create the columns
# next_agenda_item_<a>.topic_<j> for the next a = 1..5 agenda items:
#   NA                     if there is no agenda item a rows further down
#   "TOPIC NOT MENTIONED"  if the topic's prevalence in row i is not above t
#   the topic's prevalence in row i + a (as character) if that row is a
#                          Q&A session
#   ""                     if row i + a is not a Q&A session
# The nested ifelse() calls and the data.table-style indexing
# (`with = FALSE`) are kept exactly as before so the stored values do not
# change; only the five copy-pasted branches are collapsed into one loop.
n_items <- nrow(topic_table)

for (i in seq_len(n_items)) {
  for (j in seq_len(K)) {
    source_col <- paste0("topic_prop.topic_", j)

    for (ahead in 1:5) {
      target_col <- paste0("next_agenda_item_", ahead, ".topic_", j)

      topic_table[i, target_col] <- ifelse(
        i < (n_items - (ahead - 1)),
        ifelse(topic_table[i, source_col, with = FALSE] > t,
               ifelse(topic_table$session_type[i + ahead] == "Q&A",
                      as.character(topic_table[i + ahead, source_col, with = FALSE]),
                      ""),
               "TOPIC NOT MENTIONED"),
        NA
      )
    }
  }
}

topic_table_referenda <- topic_table

# Ageing

# set threshold (t) and number of topics (K)
t <- 0
K <- 10


topic_table <- stm_table_ageing

# Rename Topic1..TopicK to topic_prop.topic_1..topic_prop.topic_K
topic_table <- topic_table %>% 
  rename_with(.fn = ~ sub("^Topic", "topic_prop.topic_", .x),
              .cols = all_of(paste0("Topic", seq_len(K))))

# For every agenda item (row i) and topic (j), create the columns
# next_agenda_item_<a>.topic_<j> for the next a = 1..5 agenda items:
#   NA                     if there is no agenda item a rows further down
#   "TOPIC NOT MENTIONED"  if the topic's prevalence in row i is not above t
#   the topic's prevalence in row i + a (as character) if that row is a
#                          Q&A session
#   ""                     if row i + a is not a Q&A session
# The nested ifelse() calls and the data.table-style indexing
# (`with = FALSE`) are kept exactly as before so the stored values do not
# change; only the five copy-pasted branches are collapsed into one loop.
n_items <- nrow(topic_table)

for (i in seq_len(n_items)) {
  for (j in seq_len(K)) {
    source_col <- paste0("topic_prop.topic_", j)

    for (ahead in 1:5) {
      target_col <- paste0("next_agenda_item_", ahead, ".topic_", j)

      topic_table[i, target_col] <- ifelse(
        i < (n_items - (ahead - 1)),
        ifelse(topic_table[i, source_col, with = FALSE] > t,
               ifelse(topic_table$session_type[i + ahead] == "Q&A",
                      as.character(topic_table[i + ahead, source_col, with = FALSE]),
                      ""),
               "TOPIC NOT MENTIONED"),
        NA
      )
    }
  }
}

topic_table_ageing <- topic_table

# Climate

# t: a topic counts as mentioned in a document when its proportion exceeds t.
# K: number of topic columns (Topic1 ... TopicK) handled for this table.
t <- 0
K <- 12


topic_table <- stm_table_climate

# Rename Topic<j> to topic_prop.topic_<j>; the later pivot step splits these
# names at the dot.
topic_table <- topic_table %>% 
  rename(
    topic_prop.topic_1 = Topic1,
    topic_prop.topic_2 = Topic2,
    topic_prop.topic_3 = Topic3,
    topic_prop.topic_4 = Topic4,
    topic_prop.topic_5 = Topic5,
    topic_prop.topic_6 = Topic6,
    topic_prop.topic_7 = Topic7,
    topic_prop.topic_8 = Topic8,
    topic_prop.topic_9 = Topic9,
    topic_prop.topic_10 = Topic10,
    topic_prop.topic_11 = Topic11,
    topic_prop.topic_12 = Topic12
  )


# For every row i and topic j, fill next_agenda_item_<s>.topic_<j> for the
# rows s = 1..5 positions further down the table:
#   * NA                     if row i + s lies beyond the end of the table,
#   * "TOPIC NOT MENTIONED"  if topic j's proportion in row i is not above t,
#   * ""                     if row i + s is not a "Q&A" session,
#   * otherwise the proportion of topic j in row i + s, stored as text.
# NOTE(review): `with = FALSE` is data.table syntax, so topic_table is
# presumably a data.table - confirm against where stm_table_climate is built.
n_rows <- nrow(topic_table)

# seq_len() instead of 1:n so that an empty table skips the loop rather than
# iterating over c(1, 0).
for (i in seq_len(n_rows)) {
  for (j in seq_len(K)) {
    prop_col <- paste0("topic_prop.topic_", j)

    for (steps_ahead in seq_len(5)) {
      # Computed up front so the row index is passed as a single symbol.
      ahead_row <- i + steps_ahead
      ahead_col <- paste0("next_agenda_item_", steps_ahead, ".topic_", j)

      # The outer ifelse() is kept on purpose: it only evaluates the inner
      # expression when the row exists and reduces the one-cell result to a
      # single value. The one-cell selections are passed to `>` and
      # as.character() unchanged so the stored text stays the same.
      topic_table[i, ahead_col] <- ifelse(
        ahead_row <= n_rows,
        ifelse(topic_table[i, prop_col, with = FALSE] > t,
               ifelse(topic_table$session_type[ahead_row] == "Q&A",
                      as.character(topic_table[ahead_row, prop_col, with = FALSE]),
                      ""),
               "TOPIC NOT MENTIONED"),
        NA)
    }
  }
}

topic_table_climate <- topic_table



# Pivot the tables into tidy format

# Reshape one wide topic table so that each row represents a specific
# document and topic.
#
# All columns whose name contains "topic" with at least one visible character
# on either side (e.g. topic_prop.topic_3, next_agenda_item_2.topic_3) are
# pivoted. Their names are split at the dot: the part before it becomes a
# value column, the part after it goes into the new `topic` column.
pivot_topics_longer <- function(wide_table) {
  pivot_longer(wide_table,
               cols = na.omit(str_extract(names(wide_table),
                                          "[:graph:]+topic[:graph:]+")),
               names_to = c(".value", "topic"),
               names_sep = "\\.",
               names_repair = "unique")
}

# Eighth
tidy_table_eighth <- pivot_topics_longer(topic_table_eighth)

# Referenda
tidy_table_referenda <- pivot_topics_longer(topic_table_referenda)

# Ageing
tidy_table_ageing <- pivot_topics_longer(topic_table_ageing)

# Climate
tidy_table_climate <- pivot_topics_longer(topic_table_climate)


# Clean the data & calculate the maximum value in Q&A sessions


# Create date lookup table: dates from dat_coded, named by document order
lookup_date <- setNames(dat_coded$date, dat_coded$order)

# Clean one tidy topic table and add the largest Q&A topic proportion.
#
# Steps:
#   1. keep rows whose next_agenda_item_1 is neither "TOPIC NOT MENTIONED"
#      nor NA;
#   2. convert next_agenda_item_1..5 from text to numbers;
#   3. next_agenda_item_<s>_days = date looked up for document `order + s`
#      minus the row's own date;
#   4. set next_agenda_item_<s> to NA where that gap is greater than 2
#      (presumably days, given the column names - confirm the class of `date`);
#   5. optionally blank items 4 and 5 in every row;
#   6. keep rows with at least one non-missing item;
#   7. add qa_topic_prop_max, the row-wise maximum of the five items.
#
# Arguments:
#   tidy_tbl        a tidy table with one row per document and topic.
#   blank_items_4_5 if TRUE, next_agenda_item_4 and next_agenda_item_5 are set
#                   to NA for all rows.
# Returns the cleaned table, still row-wise.
clean_qa_table <- function(tidy_tbl, blank_items_4_5 = FALSE) {
  cleaned <- tidy_tbl %>%
    filter(next_agenda_item_1 != "TOPIC NOT MENTIONED",
           !is.na(next_agenda_item_1)) %>%
    mutate(
      next_agenda_item_1 = as.numeric(next_agenda_item_1),
      next_agenda_item_2 = as.numeric(next_agenda_item_2),
      next_agenda_item_3 = as.numeric(next_agenda_item_3),
      next_agenda_item_4 = as.numeric(next_agenda_item_4),
      next_agenda_item_5 = as.numeric(next_agenda_item_5)
    ) %>%
    mutate(
      next_agenda_item_1_days = unname(lookup_date[as.character(order + 1)]) - date,
      next_agenda_item_2_days = unname(lookup_date[as.character(order + 2)]) - date,
      next_agenda_item_3_days = unname(lookup_date[as.character(order + 3)]) - date,
      next_agenda_item_4_days = unname(lookup_date[as.character(order + 4)]) - date,
      next_agenda_item_5_days = unname(lookup_date[as.character(order + 5)]) - date
    ) %>%
    mutate(
      next_agenda_item_1 = ifelse(next_agenda_item_1_days > 2, NA, next_agenda_item_1),
      next_agenda_item_2 = ifelse(next_agenda_item_2_days > 2, NA, next_agenda_item_2),
      next_agenda_item_3 = ifelse(next_agenda_item_3_days > 2, NA, next_agenda_item_3),
      next_agenda_item_4 = ifelse(next_agenda_item_4_days > 2, NA, next_agenda_item_4),
      next_agenda_item_5 = ifelse(next_agenda_item_5_days > 2, NA, next_agenda_item_5)
    )

  if (blank_items_4_5) {
    # Both branches are NA on purpose: the items are removed for every row.
    cleaned <- cleaned %>%
      mutate(next_agenda_item_4 = ifelse(next_agenda_item_4_days > 2, NA, NA),
             next_agenda_item_5 = ifelse(next_agenda_item_5_days > 2, NA, NA))
  }

  cleaned %>%
    # drop rows in which all five items are missing
    filter(!(is.na(next_agenda_item_1) &
               is.na(next_agenda_item_2) &
               is.na(next_agenda_item_3) &
               is.na(next_agenda_item_4) &
               is.na(next_agenda_item_5))) %>%
    rowwise() %>%
    mutate(qa_topic_prop_max = max(c_across(next_agenda_item_1:next_agenda_item_5),
                                   na.rm = TRUE))
}

# Eighth
tidy_table_eighth <- clean_qa_table(tidy_table_eighth)

# Referenda: removing days 4 & 5 because of agenda
tidy_table_referenda <- clean_qa_table(tidy_table_referenda,
                                       blank_items_4_5 = TRUE)

# Ageing
tidy_table_ageing <- clean_qa_table(tidy_table_ageing)

# Climate
tidy_table_climate <- clean_qa_table(tidy_table_climate)


# Calculating control variables for regression

# Find the order number of the chosen Q&A session

# Add doc_num_qa: `order` plus the position (1-5) of the first
# next_agenda_item_* column whose value equals qa_topic_prop_max. If none of
# items 1-4 matches, position 5 is used.
add_doc_num_qa <- function(tidy_tbl) {
  # TRUE only where the item is present and equal to the maximum; a missing
  # comparison counts as "no match".
  is_peak <- function(item, peak) {
    same <- item == peak
    same & !is.na(same)
  }

  tidy_tbl %>%
    mutate(doc_num_qa = case_when(
      is_peak(next_agenda_item_1, qa_topic_prop_max) ~ order + 1,
      is_peak(next_agenda_item_2, qa_topic_prop_max) ~ order + 2,
      is_peak(next_agenda_item_3, qa_topic_prop_max) ~ order + 3,
      is_peak(next_agenda_item_4, qa_topic_prop_max) ~ order + 4,
      TRUE ~ order + 5
    ))
}

# Eighth
tidy_table_eighth <- add_doc_num_qa(tidy_table_eighth)

# Referenda
tidy_table_referenda <- add_doc_num_qa(tidy_table_referenda)

# Ageing
tidy_table_ageing <- add_doc_num_qa(tidy_table_ageing)

# Climate
tidy_table_climate <- add_doc_num_qa(tidy_table_climate)


# Create lookup tables

# Date lookup table: dates from dat_coded, named by document order
lookup_date <- setNames(dat_coded$date, dat_coded$order)

# Lookup table for session length, named by document order
lookup_length <- setNames(docvars(corp, "session_length"),
                          docvars(corp, "order"))

# Add three control variables to a tidy table:
#   num_days        date looked up for the chosen Q&A document (doc_num_qa)
#                   minus the row's own date;
#   duration_qa     session length looked up for the chosen Q&A document;
#   duration_expert session length looked up for the row's own document.
add_timing_controls <- function(tidy_tbl) {
  tidy_tbl %>%
    mutate(
      num_days = unname(lookup_date[as.character(doc_num_qa)]) - date,
      duration_qa = unname(lookup_length[as.character(doc_num_qa)]),
      duration_expert = unname(lookup_length[as.character(order)])
    )
}

# Eighth
tidy_table_eighth <- add_timing_controls(tidy_table_eighth)

# Referenda
tidy_table_referenda <- add_timing_controls(tidy_table_referenda)

# Ageing
tidy_table_ageing <- add_timing_controls(tidy_table_ageing)

# Climate
tidy_table_climate <- add_timing_controls(tidy_table_climate)


# Export tables to CSV for later use

# One UTF-8 CSV file per issue (Eighth, Referenda, Ageing, Climate); the list
# names are the output file names.
chunk_tables <- list(
  "eighth_chunks.csv" = tidy_table_eighth,
  "referenda_chunks.csv" = tidy_table_referenda,
  "ageing_chunks.csv" = tidy_table_ageing,
  "climate_chunks.csv" = tidy_table_climate
)

for (csv_path in names(chunk_tables)) {
  write.csv(chunk_tables[[csv_path]], csv_path,
            fileEncoding = "UTF-8")
}

# Figure A10 ----

# Import datasets

# Read one exported chunk table and add the variables used in the analysis:
#   num_docs           doc_num_qa minus order;
#   speaker,
#   position_short     missing values replaced by "Other";
#   qa_topic_prop_sum  row-wise sum of next_agenda_item_1..5 (NAs ignored);
#   qa_topic_prop_avg  row-wise mean of next_agenda_item_1..5 (NAs ignored).
# Returns an ungrouped table.
read_chunks <- function(csv_path) {
  read.csv(csv_path, encoding = "UTF-8") %>%
    mutate(
      num_docs = doc_num_qa - order,
      speaker = ifelse(is.na(speaker), "Other", speaker),
      position_short = ifelse(is.na(position_short), "Other", position_short)
    ) %>%
    rowwise() %>%
    mutate(
      qa_topic_prop_sum = sum(c_across(next_agenda_item_1:next_agenda_item_5),
                              na.rm = TRUE),
      qa_topic_prop_avg = mean(c_across(next_agenda_item_1:next_agenda_item_5),
                               na.rm = TRUE)
    ) %>%
    ungroup()
}

# Eighth (only documents with order below 180 are kept)
eighth <- read_chunks("eighth_chunks.csv") %>%
  filter(as.numeric(order) < 180)

# Referenda
referenda <- read_chunks("referenda_chunks.csv")

# Ageing
ageing_population <- read_chunks("ageing_chunks.csv")

# Climate
climate_change <- read_chunks("climate_chunks.csv")

# Combine datasets

dat_da <- rbind(eighth, ageing_population, referenda, climate_change,
                stringsAsFactors = FALSE)

# The combined file is written before the label clean-up below, so it still
# contains the original "Expert :" spelling.
write_csv(dat_da, "data_analysis_chunks.csv")

dat_da <- dat_da %>%
  mutate(position_short = str_replace_all(position_short,
                                          "Expert :", "Expert:"))

# Obtain Summary Statistics

# Row count plus mean, median, sd, min and max of session_length,
# computed per group if the input is grouped.
describe_session_length <- function(tbl) {
  tbl %>%
    summarise(count = n(),
              mean_length = mean(session_length),
              median_length = median(session_length),
              sd_length = sd(session_length),
              min_length = min(session_length),
              max_length = max(session_length))
}

# per issue
summary_table <- dat_da %>%
  group_by(group) %>%
  describe_session_length()

# all issues pooled
summary_table2 <- describe_session_length(dat_da)

# number of distinct documents (order values) per issue
count_docs <- dat_da %>%
  distinct(order, group) %>%
  group_by(group) %>%
  summarise(count = n())

# same count, obtained by de-duplicating within each issue
count_distinct <- dat_da %>%
  group_by(group) %>%
  distinct(order) %>%
  summarise(count = n())

# number of distinct speakers
count_distinct_speaker <- dat_da %>%
  distinct(speaker) %>%
  summarise(count = n())

# Plot showing relationship between topic proportion 
# of expert speech and maximum following Q&A


# Add additional variables:
#   group, session_type  converted to factors;
#   topic_prop_log       log of the topic proportion expressed in percent;
#   topic_group          topic and issue pasted together, i.e. one label per
#                        topic within each issue.

dat_da <- dat_da %>% 
  mutate(group = as.factor(group),
         session_type = as.factor(session_type),
         topic_prop_log = log(topic_prop * 100)) %>% 
  as.data.frame() %>% 
  mutate(topic_group = paste0(topic, group))


# Generate discrete buckets for topic proportions: within each topic_group,
# values above the 2/3 quantile are "High", values above the 1/3 quantile are
# "Medium", everything else is "Low".
# ungroup() so the grouping does not leak into later steps.

dat_da <- dat_da %>% 
  group_by(topic_group) %>% 
  mutate(topic_prop_discrete = case_when(topic_prop > quantile(topic_prop, 2/3) ~ "High",
                                         topic_prop > quantile(topic_prop, 1/3) ~ "Medium",
                                         TRUE ~ "Low")) %>% 
  ungroup()

# Arrange the topic proportion discrete buckets

dat_da$topic_prop_discrete <- factor(dat_da$topic_prop_discrete,
                                     levels = c("Low", 
                                                "Medium",
                                                "High"))

# Add additional descriptive variables: minimum, maximum, range and standard
# deviation of topic_prop within each date / speaker / issue combination.
# ungroup() afterwards: otherwise dat_da stays grouped by date, speaker and
# group for every later mutate(), model and plot that receives it.

dat_da <- dat_da %>% 
  group_by(date, speaker, group) %>% 
  mutate(min_topic = min(topic_prop),
         max_topic = max(topic_prop),
         range_topics = max_topic - min_topic,
         sd_topics = sd(topic_prop)) %>% 
  ungroup()


# Organise session type levels

dat_da$session_type <- factor(dat_da$session_type,
                              levels = c("Other", 
                                         "Q&A",
                                         "Expert"))


# Fractional logistic regression model with maximum value

table(dat_da$session_type)

# Collapse session_type to two labels: "Expert" becomes "Expert Input", every
# other non-missing value becomes "Other Agenda Item". fct_rev() then converts
# the labels to a factor with reversed level order.
session_label <- ifelse(dat_da$session_type == "Expert",
                        "Expert Input",
                        "Other Agenda Item")

dat_da$session_type <- forcats::fct_rev(session_label)

table(dat_da$session_type)

# Outcome: largest topic proportion among the following Q&A sessions.
# Predictors: topic proportion interacted with session type, plus session
# durations, issue, topic-within-issue and the distance in documents.
glm_max_continuous <- glm(
  qa_topic_prop_max ~ topic_prop * session_type +
    duration_expert + duration_qa + group + topic_group + num_docs,
  family = quasibinomial(link = "logit"),
  data = dat_da
)


# Fractional logistic regression model with mean value

# Same specification with the average Q&A topic proportion as the outcome.
glm_avg_continuous <- glm(
  qa_topic_prop_avg ~ topic_prop * session_type +
    duration_expert + duration_qa + group + topic_group + num_docs,
  family = quasibinomial(link = "logit"),
  data = dat_da
)



# Predicted values over all observed topic proportions, by session type

# model with the maximum as outcome
p_cont_max <- ggpredict(glm_max_continuous,
                        terms = c("topic_prop [all]", "session_type"))

# model with the average as outcome
p_cont_avg <- ggpredict(glm_avg_continuous,
                        terms = c("topic_prop [all]", "session_type"))


# Figure A10a: predictions from the maximum model, with a rug marking the
# observed topic proportions.
fig_a10a <- plot_continuous(x = p_cont_max) +
  scale_y_continuous(limits = c(0, 0.6),
                     breaks = seq(0, 0.8, 0.2)) +
  geom_rug(data = dat_da,
           mapping = aes(x = topic_prop),
           inherit.aes = FALSE,
           colour = "grey20")
fig_a10a
ggsave("fig_A10a.pdf",
       plot = fig_a10a,
       width = 5, height = 6)


# Regression model with discrete independent variable

# Outcome: largest Q&A topic proportion. The Low/Medium/High topic proportion
# bucket replaces the continuous topic proportion.
glm_max_discrete <- glm(
  qa_topic_prop_max ~ topic_prop_discrete * session_type +
    duration_expert + duration_qa + group + topic_group + num_docs,
  family = quasibinomial(link = "logit"),
  data = dat_da
)


# Regression model (average Topic Prevalence)
# with discrete independent variable

glm_avg_discrete <- glm(
  qa_topic_prop_avg ~ topic_prop_discrete * session_type +
    duration_expert + duration_qa + group + topic_group + num_docs,
  family = quasibinomial(link = "logit"),
  data = dat_da
)


# Predicted values of the discrete models, by bucket and session type

p_discrete_max <- ggpredict(glm_max_discrete,
                            terms = c("topic_prop_discrete", "session_type"))

p_discrete_avg <- ggpredict(glm_avg_discrete,
                            terms = c("topic_prop_discrete", "session_type"))


# Figure A10b: predictions from the maximum model
fig_a10b <- plot_discrete(x = p_discrete_max) +
  scale_y_continuous(limits = c(0, 0.4),
                     breaks = seq(0, 0.4, 0.1))
fig_a10b
ggsave("fig_A10b.pdf",
       plot = fig_a10b,
       width = 5, height = 6)


# Table A4 ----

# Display labels for the coefficients reported in the table
# (model term = printed label)
coef_labels_a4 <- list(
  "topic_prop" = "Topic Prevalence",
  "session_typeExpert Input" = "Expert Input",
  "topic_prop:session_typeExpert Input" = "Topic Prevalence $\\times$ Expert Input",
  "topic_prop_discreteMedium" = "Topic Prevalence (Discrete): Medium",
  "topic_prop_discreteHigh" = "Topic Prevalence (Discrete): High",
  "topic_prop_discreteMedium:session_typeExpert Input" = "Topic Prevalence (Discrete): Medium $\\times$ Expert Input",
  "topic_prop_discreteHigh:session_typeExpert Input" = "Topic Prevalence (Discrete): High $\\times$ Expert Input"
)

# One check mark per model column for the extra footer rows
checkmarks_a4 <- rep("\\checkmark", 2)

# Write the LaTeX table for the continuous and discrete maximum models
texreg(list(glm_max_continuous,
            glm_max_discrete),
       file = "tab_A4.tex",
       caption = "Predicting issue emphasis in Q\\&A sessions (based on corpus split into chunks)",
       caption.above = TRUE,
       label = "tab:main_proc_chunks",
       fontsize = "footnotesize",
       omit.coef = c("(Intercept)|topic_group*|group*|duration_*|num_docs*"),
       custom.coef.map = coef_labels_a4,
       custom.gof.rows = list("Controls" = checkmarks_a4,
                              "Fixed Effects: 4 Issues" = checkmarks_a4,
                              "Fixed Effects: Topics" = checkmarks_a4),
       include.aic = FALSE,
       include.bic = FALSE,
       include.loglik = FALSE)
